---
title: "COVID-19 Evacuation Study"
subtitle: "Replication Code"
authors: "Courtney Page-Tan and Timothy Fraser"
output: html_notebook
---

This study tests whether evacuation from severe disasters led to upticks in the spread of COVID. We estimate evacuation using data from Facebook Data for Good. Their project tallies up the number of Facebook users who were identified to be in the disaster hit area consistently two weeks prior to the disaster. Then, they tally up how many people went from any neighborhood in that region to any other neighborhood. Unfortunately, this data is measured at the neighborhood level, using special polgyons. This is not a level joinable with census data, and by data sharing agreement, the researchers cannot share the original raw data without permission from Facebook. So, to remedy both these issues, we aggregated everything up from the neighborhood level to the census tract level. This is described in detail in fb_network_maker_code.Rmd. While we can't share the original base data, located in the fb_data folder, we can share all other data for replication, which we describe below!

# 0. Packages

First, let's load some packages!

```{r, message = FALSE, warning = FALSE}
library(tidyverse) # for tidy data manipulation
# Using the most recent version of dplyr, we can bind_rows() for sf objects
# Download the most recent version here:
# library(remotes)
# remotes::install_github("tidyverse/dplyr")
library(dplyr)
library(dtplyr) # for dplyr functions with data.table, the speedy data manipulation package

# GIS packages
library(sf) # for tidy spatial data
library(rgdal) # for spatial operations
library(tigris) # for accessing census boundaries
library(tidycensus) # for downloading census data
library(censusapi) # for downloading census data
# Networks Packages
library(tidygraph)
library(ggraph)
#remotes::install_github("luukvdmeer/sfnetworks")
library(sfnetworks)
library(gstat)

# Multilevel Modeling
library(lme4)
library(lmerTest)

# Matching
library(MatchIt)
library(cem)
```

# 1. Building the Network

## 1.1 Southeastern Louisiana after Hurricane Zeta

```{r}
# Finally, let's convert this into a network object.
# This network object is in tidygraph, via the sfnetworks package;
# It is geocoded, and can be mapped.
# It is a directed network, weighted by the number of evacuees

# Import your network object
sfnetworks::sfnetwork(
  nodes = read_rds("shapes/csub_centroid.rds") %>%
    filter(str_sub(geoid, 1,2 ) == "22"), 
  edges = read_rds("raw_data/hurricane_zeta_edges_csub.rds"), 
  force = TRUE,
  directed = TRUE,
  node_key = "geoid", 
  edges_as_lines = TRUE,
  length_as_weight = FALSE) %>%
  # Now briefly transform the coordinate projection system to North American Equidistant Conic
    st_transform(crs = CRS(paste0("+proj=eqdc +lat_0=0 +lon_0=0 +lat_1=20 +lat_2=60 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"))) %>%
  activate("edges") %>%
  # And calculate the distance between tracts in km
  mutate(km = as.numeric(st_length(geometry, which = "Euclidean") / 1000)) %>%
  # Now transform back to WSG84
  st_transform(CRS(paste0("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"))) %>%
  saveRDS("raw_data/hurricane_zeta_la.rds")


# Get North American Equidistant Conic Projection
#https://spatialreference.org/ref/esri/102010/
aed <- "+proj=eqdc +lat_0=0 +lon_0=0 +lat_1=20 +lat_2=60 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"

remove(centroids, edgelist, neighborhoods, ids, states,counties)
```


## 1.2 California

```{r}
# Finally, let's convert this into a network object.
# This network object is in tidygraph, via the sfnetworks package;
# It is geocoded, and can be mapped.
# It is a directed network, weighted by the number of evacuees

# Import your network object
sfnetworks::sfnetwork(
  nodes = read_rds("shapes/csub_centroid.rds") %>%
    filter(str_sub(geoid, 1,2 ) == "06"), 
  edges = read_rds("raw_data/glass_fire_edges_csub.rds"), 
  force = TRUE,
  directed = TRUE,
  node_key = "geoid", 
  edges_as_lines = TRUE,
  length_as_weight = FALSE) %>%
  # Now briefly transform the coordinate projection system to North American Equidistant Conic
    st_transform(crs = CRS(paste0("+proj=eqdc +lat_0=0 +lon_0=0 +lat_1=20 +lat_2=60 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"))) %>%
  activate("edges") %>%
  # And calculate the distance between tracts in km
  mutate(km = as.numeric(st_length(geometry, which = "Euclidean") / 1000)) %>%
  # Now transform back to WSG84
  st_transform(CRS(paste0("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"))) %>%
  saveRDS("raw_data/glass_fire_ca.rds")


# Get North American Equidistant Conic Projection
#https://spatialreference.org/ref/esri/102010/
aed <- "+proj=eqdc +lat_0=0 +lon_0=0 +lat_1=20 +lat_2=60 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"

remove(centroids, edgelist, neighborhoods, ids, states,counties)
```





# 2. County Covariates

First, let's download land area data per county.

## 2.1 Area data

```{r, message = FALSE, warning = FALSE}
tigris::counties(year = "2018") %>%
  sf::st_as_sf() %>%
  as.data.frame() %>% ungroup() %>%
  select(-geometry) %>%
  select(fips = GEOID, area_land = ALAND) %>%
  # convert land area, which is measured in square meters, to square kilometers
  mutate(area_land = as.numeric(area_land) / 1000000) %>%
  write_csv("raw_data/covariates/area.csv")
```

## 2.2 Party Voteshare data

Second, let's download data on shares of the vote that went to specific parties between 2000 and 2016. (This study's COVID outcome data occurs prior to the 2020 presidential election.)

```{r mitdata}
# ImportCounty Presidential Election Results 2000-2016 Data
# from MIT Elections Data & Science Lab
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/VOQCHQ

read_csv("raw_data/covariates/MIT_countypres_2000_2016.csv") %>%
  filter(office == "President") %>%
  filter(party %in% c("democrat", "republican")) %>% 
  rename(fips = FIPS) %>%
  # Convert unique county id FIPS code from numeric to a five digit character code
  mutate(fips = str_pad(fips, width = 5, side = "left", pad = "0")) %>%
  # Aggregate voteshare by county, party, and year
  group_by(fips, party, year) %>%
    # Create the share of votes that went to each party per county-year election
  summarize(voteshare = sum(candidatevotes, na.rm = TRUE) / sum(totalvotes, na.rm = TRUE)) %>%
    # Pivot wider
  pivot_wider(
    id_cols = c(fips),
    names_from = c(party, year),
    names_sep = "_",
    values_from = voteshare) %>%
  write_csv("raw_data/covariates/county_elections.csv")
```


## 2.3 Google Mobility data

Third, let's get tallies of Google mobility data!

```{r}
# Specify import settings, to make sure read_csv() doesn't affect the file
settings = list(country_region_code = col_character(), 
                country_region = col_character(), 
                sub_region_1 = col_character(), 
                sub_region_2 = col_character(), 
                metro_area = col_character(), 
                iso_3166_2_code = col_character(),
                census_fips_code = col_character(),
                date = col_datetime(),
                col_double(), col_double(), 
                col_double(), col_double(), 
                col_double())

# Download US results
read_csv("https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv",
         col_types = settings) %>% 
  # Fiter to just actual county entries
  filter(country_region_code == "US",  !is.na(sub_region_2)) %>%
  # Remove unnecessary fields
  select(state = sub_region_1, county = sub_region_2, 
         fips = census_fips_code, date, 
         retail_recreation = retail_and_recreation_percent_change_from_baseline,
         grocery_pharmacy = grocery_and_pharmacy_percent_change_from_baseline,
         parks = parks_percent_change_from_baseline,
         transit = transit_stations_percent_change_from_baseline,
         residential = residential_percent_change_from_baseline,
         workplaces = workplaces_percent_change_from_baseline) %>%
  # get rid of the words "County" or "Parish" for easy joining with other datasets
  # mutate(county = county %>% str_remove(" County| Parish")) %>%
  # Turn date into a "Date" format for easy joining with datasets
  mutate(date = as.Date(date)) %>%
  # export to file
  saveRDS("raw_data/covariates/google_us.rds")

```


For every day in every county, let's estimate mean mobility rates from several days prior

95% of people with COVID develop symptoms between 2.1 and 11.1 days, with a median of 5-6 days. 

https://www.cdc.gov/coronavirus/2019-ncov/hcp/clinical-guidance-management-patients.html#:~:text=The%20incubation%20period%20for%20COVID,from%20exposure%20to%20symptoms%20onset.&text=One%20study%20reported%20that%2097.5,SARS%2DCoV%2D2%20infection.

Guan WJ, Ni ZY, Hu Y, et al. Clinical Characteristics of Coronavirus Disease 2019 in China. N Engl J Med. 2020 Apr 30;382:1708– doi:10.1056/NEJMoa2002032external icon.

Li Q, Guan X, Wu P, et al. Early Transmission Dynamics in Wuhan, China, of Novel Coronavirus-Infected Pneumonia. N Engl J Med. 2020 Mar 26;382:1199–207. doi:10.1056/nejmoa2001316external icon.

Lauer SA, Grantz KH, Bi Q, et al. The Incubation Period of Coronavirus Disease 2019 (COVID-19) From Publicly Reported Confirmed Cases: Estimation and Application. Ann Intern Med. 2020 May 5;172(9):577–82. doi:10.7326/M20-0504external icon.

So, let's average mobility rates between 2 and 11 days earlier.

```{r}
# Get the range of dates
daterange <- seq(from = as.Date("2020-02-15"),
    to = as.Date("2021-01-27"), by = "days") %>%
  as.data.frame() %>%
  rename(start = 1) %>%
  mutate(date = lead(start, 11)) %>%
  mutate(end = lag(date, 2)) %>%
  filter(!is.na(end)) %>%
  mutate(id = 1:n()) %>%
  select(id, date, start, end)

# I want a record for day T, that gives me the mean mobility for days t-2 to t-11.
```

```{r, message = FALSE, warning = FALSE}
get_mobility = function(data){
  # Print the number of rows we have processed
  print(data$id[1])
  
  read_rds("raw_data/covariates/google_us.rds") %>%
    select(-state, -county) %>%
    # Arrange like this: NYC 2020-02-01, NYC 2020-02-02, etc.
    arrange(fips, date) %>%
    # For each county,
    group_by(fips) %>%
     # Filter mobility records to just those on or 
    # after the start date of the period, 
    # and before the end date of the period.
    filter(date >= data$start,
           date < data$end) %>%
    # Pivot into fewer columns for tidy calculation
    pivot_longer(
      cols = -c(fips, date),
      names_to = "measure",
      values_to = "value", 
      values_drop_na = TRUE) %>%
    # For each county and type of mobility,
    group_by(fips, measure) %>%
    # Calculate the mean percent change in mobility during the period
    summarize(
      value = mean(value, na.rm = TRUE)) %>%
    # Droppining NaN values were no valid observations could be averaged
    filter(!is.nan(value)) %>%
    ungroup() %>%
    # and pivot back our into a wide matrix for easy reading,
    pivot_wider(
      id_cols = c(fips),
      names_from = measure,
      values_from = value) %>%
    # adding the start and end date of the period to the record
    mutate(
      # Date of COVID observation (t)
      date = data$date,
      # Start of date range analyzed (t-11)
      start = data$start,
      # End of date range analyzed (t-2)
      end = data$end) %>%
    return()
}

daterange %>%
  split(.$id) %>%
  map_dfr(~get_mobility(.)) %>%
  saveRDS("raw_data/covariates/average_google_mobility.rds")

```




## 2.4 ACS Risk Factors

Fourth, let's access ACS risk factors.

```{r}
# ACS Risk Factors and Health Risk data here
# https://www.census.gov/data/experimental-data-products/community-resilience-estimates.html

# Read details here:
# https://www2.census.gov/data/experimental-data-products/community-resilience-estimates/2020/technical-document.pdf

read_csv("https://www2.census.gov/data/experimental-data-products/community-resilience-estimates/2020/cre-2018-a11.csv") %>%
  saveRDS("raw_data/covariates/acs_risk.rds")


#The ACS Community Resilience Estimates combine 11 risk factors for each census tract, including (1) if the income-to-poverty ratio is above 1.3, (2) the number of single or zero caregiver households, (3) crowding defined as more than 0.75 persons per room or living in blocks with a population sentiy of 4,000 people per square mile or more; (4) communication barriers, defined as linguistically isolated or no one in a household over the age of 16 holding a high school diploma; (5) no employed persons, (6) persons reporting disability, (7) persons lacking health insurance coverage, (8) persons over age 65, (9) persons with a serious heart condition, (10) persons with diabetes, (11) persons with emphysema or asthma.

# We use the Community Resilience Estimate for having 1, 2, or 3 risks, which is a probability of having those risks.

# Download census tract estimates
read_rds("raw_data/covariates/acs_risk.rds") %>%
  filter(geo_level == "Tract") %>%
  select(geoid, contains("predrt")) %>%
  saveRDS("raw_data/covariates/acs_risk_tract.rds")

# Download county estimates
read_rds("raw_data/covariates/acs_risk.rds") %>%
  filter(geo_level == "County") %>%
  select(geoid, contains("predrt")) %>%
  saveRDS("raw_data/covariates/acs_risk_county.rds")
```


## 2.5 County Health Rankings

```{r, message = FALSE, warning = FALSE}
# Use this website
# https://www.countyhealthrankings.org/explore-health-rankings/rankings-data-documentation

read_csv("raw_data/covariates/chr_2020.csv") %>%
  # remove anything other than the raw measures
  dplyr::select(!matches("CI low|CI high|numerator|denominator")) %>%
  # remove extra variable names
  slice(-1) %>%
  # Remove state or country level summaries
  filter(`County FIPS Code` != "000") %>%
  # Make all variables numeric, except identifiers
  mutate_at(vars(-c(1:7)), (funs(. %>% as.numeric))) %>%
  # select variables
  dplyr::select(state = `State Abbreviation`, 
                fips = `5-digit FIPS Code`, county = Name,
         # Documentation available here:
         # https://www.countyhealthrankings.org/explore-health-rankings/measures-data-sources/2020-measures
         
         # Years of potential life lost before age 75 per 100,000 population (age-adjusted). (2016-2018)
         premature_death = `Premature death raw value`,  
         premature_death_black = `Premature death (Black)`,
         premature_death_hispanic = `Premature death (Hispanic)`,
         premature_death_white = `Premature death (White)`,
         # Percentage of adults reporting fair or poor health (age-adjusted) 2017
         poor_fair_health = `Poor or fair health raw value`, 
         # Average number of physically unhealthy days reported in past 30 days (age-adjusted). 2017
         poor_phys_health_days = `Poor physical health days raw value`,
         # Average number of mentally unhealthy days reported in past 30 days (age-adjusted) 2017
         poor_ment_health_days = `Poor mental health days raw value`,
         # Percentage of adults who are current smokers. 2017
         smoking = `Adult smoking raw value`, 
         # Percentage of the adult population (age 20 and older) 
         # that reports a body mass index (BMI) greater than or equal to 30 kg/m2. (2016)
         obesity = `Adult obesity raw value`,
         # Percentage of adults aged 20 and above with diagnosed diabetes. (2016)
         diabetes = `Diabetes prevalence raw value`, 
         # Index of factors that contribute to a healthy food environment,
         # from 0 (worst) to 10 (best). (2015-2017)
         food_env_index = `Food environment index raw value`,
         # Percentage of adults age 20 and over reporting no leisure-time physical activity. (2016)
         physical_inactivity = `Physical inactivity raw value`,
         # Percentage of population with adequate access to locations for physical activity. (2010, 2019)
         exercise_access = `Access to exercise opportunities raw value`,
         # Percentage of adults reporting binge or heavy drinking (2017)
         drinking_excessive = `Excessive drinking raw value`, 
         # Percentage of driving deaths with alcohol involvement. (2014-2018)
         alcohol_driving_deaths = `Alcohol-impaired driving deaths raw value`,
         # Number of newly diagnosed chlamydia cases per 100,000 population. (2017) 
         sexually_transmitted_infections = `Sexually transmitted infections raw value`,
         # Percentage of population under age 65 without health insurance. (2017)
         uninsured = `Uninsured raw value`,
         #  rate of number of primary care providers/100,000 population (2017)
         primary_care_physicians = `Primary care physicians raw value`,
         #  rate of number of dentistry providers/100,000 population (2018)
         dentists = `Dentists raw value`,
         #  rate of number of mental health providers/100,000 population (2019)
         mental_health_providers = `Mental health providers raw value`,
         # Rate of hospital stays for ambulatory-care sensitive conditions per 100,000 Medicare enrollees. (2017)
         prevent_hospital_stays = `Preventable hospital stays raw value`,
         # Rate of hospital stays for ambulatory-care sensitive conditions per 100,000 Medicare enrollees. (2017)
         prevent_hospital_stays_white = `Preventable hospital stays (White)`,
         # Rate of hospital stays for ambulatory-care sensitive conditions per 100,000 Medicare enrollees. (2017)
         prevent_hospital_stays_black = `Preventable hospital stays (Black)`,
        # `Percentage of fee-for-service (FFS) Medicare enrollees that had an annual flu vaccination.`
        flu_vaccinations = `Flu vaccinations raw value`,
        # Percentage of adults ages 25-44 with some post-secondary education. (2014-2018)
        some_college = `Some college raw value`,
        # Percentage of population ages 16 and older unemployed but seeking work. (2018)
        unemployment = `Unemployment raw value`,
        # Ratio of household income at the 80th percentile to income at the 20th percentile (2014-2018)
        income_inequality = `Income inequality raw value`,
        # Number of membership associations per 10,000 population. (2017)
        social_associations = `Social associations raw value`,
        # Number of reported violent crime offenses per 100,000 population. (2014-2016)
        violent_crime = `Violent crime raw value`,
        # `Number of deaths due to injury per 100,000 population.` (2014-2018)
        injury_deaths = `Injury deaths raw value`,
        # Average daily density of fine particulate matter in micrograms per cubic meter (PM2.5). (2014)
        air_pollution = `Air pollution - particulate matter raw value`,
        # Average number of years a person can expect to live. (2016-2018)
        life_expectancy = `Life expectancy raw value`,
        life_expectancy_black = `Life expectancy (Black)`,
        life_expectancy_white = `Life expectancy (White)`,
        life_expectancy_hisp = `Life expectancy (Hispanic)`,
        life_expectancy_asian = `Life expectancy (Asian/Pacific Islander)`,
        # Number of deaths among residents under age 75 per 100,000 population (age-adjusted). (2016-2018)
        premature_age_adjusted_mortality = `Premature age-adjusted mortality raw value`,
        
        # Percentage of adults reporting 14 or more days of poor physical health per month. (2017)
        frequent_phys_distress = `Frequent physical distress raw value`,
        # Percentage of adults reporting 14 or more days of poor mental health per month. (2017)
        frequent_ment_distress = `Frequent mental distress raw value`,
        # Number of people aged 13 years and older living with a diagnosis of
        # human immunodeficiency virus (HIV) infection per 100,000 population. (2016)
        hiv = `HIV prevalence raw value`,
        # `Number of drug poisoning deaths per 100,000 population. (2016-2018)
        drug_overdose_deaths = `Drug overdose deaths raw value`,
        # The income where half of households in a county earn more and half of households earn less. (2018)
        median_household_income = `Median household income raw value`,
        # Index of dissimilarity where higher values indicate greater residential
        # segregation between Black and White county residents. (2014-2018)
        residential_segregation_black_white = `Residential segregation - Black/White raw value`,
        # Index of dissimilarity where higher values indicate greater residential 
        # segregation between non-White and White county residents. (2014-2018)
        residential_segregation_nonwhite_white = `Residential segregation - non-White/White raw value`,
        # Percentage of occupied housing units that are owned. (2014-2018)
        homeownership = `Homeownership raw value`,
        # The following demographic variables all come from the American Community Survey 2014-2018
        pop = `Population raw value`,
        pop_age_under_18 = `% below 18 years of age raw value`,
        pop_age_65_plus = `% 65 and older raw value`,
        pop_black = `% Non-Hispanic Black raw value`,
        pop_nativeam = `% American Indian & Alaska Native raw value`,
        pop_asian = `% Asian raw value`,
        pop_pacific = `% Native Hawaiian/Other Pacific Islander raw value`,
        pop_hisp = `% Hispanic raw value`,
        pop_white = `% Non-Hispanic White raw value`, 
        pop_non_english_speaker = `% not proficient in English raw value`, # 2014-2018
        pop_female = `% Females raw value`,
        pop_rural = `% Rural raw value`) %>%
  # Now merge in social capital indices
  #left_join(y = read_csv("indices.csv"), by = "fips") %>%
  # Merge in area
  left_join(y = read_csv("raw_data/covariates/area.csv"), by = "fips") %>%
  # Let's impute the  mean for a few key indicators' scores
  group_by(state) %>%
  mutate_at(vars(smoking, drinking_excessive, physical_inactivity,
                 diabetes, obesity, frequent_phys_distress, premature_age_adjusted_mortality,
                 prevent_hospital_stays, primary_care_physicians),
            funs(if_else(is.na(.), true = mean(., na.rm = TRUE), false = .))) %>%
  ungroup() %>%
  mutate_at(vars(smoking, drinking_excessive, physical_inactivity,
                 diabetes, obesity, frequent_phys_distress, premature_age_adjusted_mortality,
                 prevent_hospital_stays, primary_care_physicians),
            funs(scale(.))) %>%
  # Let's also- using the entire country's scores - build a Health Quality score
  rowwise() %>%
  mutate(health_quality = sum( 
    smoking + drinking_excessive + physical_inactivity +
      diabetes + obesity + frequent_phys_distress +
      premature_age_adjusted_mortality ) / 7) %>%
  # Build a Health Care Capacity score
  mutate(health_care_capacity = sum(-1*prevent_hospital_stays + primary_care_physicians) / 2) %>%
  ungroup() %>%
  # export
  saveRDS("raw_data/covariates/chr.rds")

# Identify variables missing less than 5% of values
available <- read_rds("raw_data/covariates/chr.rds") %>% 
  pivot_longer(
    cols = -c(state, fips, county),
    names_to = "measure",
    values_to = "value") %>%
  group_by(measure) %>%
  summarize(available = sum(if_else(!is.na(value), 1, 0), na.rm = TRUE) / n()) %>%
  filter(available >= 0.95) %>%
  select(measure) %>% unlist() %>% unname()

# Now keep just those variables
read_rds("raw_data/covariates/chr.rds") %>%
  select(state, fips, county, available) %>%
  saveRDS("raw_data/covariates/chr.rds")
```

## 2.6 Government Data

```{r}
library(tidyverse)
library(tidycensus)
library(censusapi)

census_api_key("97b29053a89e5396ef806bf7f4fae05b7387fabd")

#vars <- load_variables(year = 2018, dataset = "acs5", cache = TRUE) %>%
#  mutate_at(vars(label, concept), funs(tolower(.)))

# Download number of municipal employees and population
get_acs(geography = "county", 
               survey = "acs5", 
               year = 2018, 
               variables = list(employee_muni = "B08128_006",
                                pop = "B01001_001")) %>%
  select(fips = GEOID, variable,estimate) %>%
  pivot_wider(id_cols = fips, names_from = variable, values_from = estimate) %>%
  # Calculate number of municipal employees per 1000 residents
  mutate(employee_muni = employee_muni / pop * 1000) %>%
  select(-pop) %>%
  saveRDS("raw_data/covariates/employee_muni_2018.rds")


```



## 2.7 Join County dataset

```{r, message=FALSE, warning = FALSE}
# For each period, where date means the END date of the period,
# get the excess deaths

# Leave mobility data as is; we'll import it separately to each dataset,
# since it is so large.
# read_rds("covariates/average_google_mobility.rds")

# Join in CHR data
dat <- read_rds("raw_data/covariates/chr.rds") %>%
  # Join in county election outcomes
  left_join(y = read_csv("raw_data/covariates/county_elections.csv"), by = "fips") %>%
  # Join in municipal employees
  left_join(by = "fips", 
            y = read_rds("raw_data/covariates/employee_muni_2018.rds")) %>%
  # Join in county risk factors
  left_join(by = c("fips" = "geoid"), 
            y = read_rds("raw_data/covariates/acs_risk_county.rds")) 

# Reorder columns for easy viewing
bind_cols(
  dat %>%
    select(state, county, fips),
  dat %>%
    select(-state, -county, -fips)) %>%
  saveRDS("raw_data/covariates/covariates_county.rds")

remove(dat)
```

# 3. Tract Covariates

## 3.1. SVI Data

```{r}
# Create a folder to house SVI data
dir.create("svi")

# We downloaded the CDC's Social Vulnerability Index at the census tract level
# for the year 2018 from the following location on Sun, November 22, 2020:
# https://www.atsdr.cdc.gov/placeandhealth/svi/data_documentation_download.html

# First, resave into .rds format, for quicker access
read_csv("raw_data/covariates/SVI2018_US.csv") %>% 
    select(fips = FIPS, state = ST_ABBR,
         # Grab the SVI
         svi = RPL_THEMES,
         # Grab the four subindices
         svi_socioeconomic = RPL_THEME1, # Socioeconomic – RPL_THEME1
         svi_household_disability = RPL_THEME2, #Household Composition & Disability – RPL_THEME2
         svi_minority = RPL_THEME3, #Minority Status & Language – RPL_THEME3
         svi_housing_tranport = RPL_THEME4 # Housing Type & Transportation – RPL_THEME4
         )  %>%
  # -999 means no data, so let's remove those
    mutate_at(vars(svi:svi_housing_tranport),
            funs(if_else(. == -999, NA_real_, .))) %>%
  # and save in a more comfortable file format
  saveRDS("raw_data/covariates/svi_tract.rds")
```

### Zipcode

```{r}
# Load packages
library(tidyverse)
library(tigris)
library(sf)
library(rgdal)

# Aggregate census tract measures to zipcode level
aggregate_zip = function(MYSTATE){
  
  print(MYSTATE)
  
  # Import 2018 SVI for given state
  tracts <- tigris::tracts(state = MYSTATE, year = 2010, cb = TRUE) %>%
    st_as_sf() %>%
    select(tracts = GEO_ID, geometry) %>%
    mutate(tracts = tracts %>% str_sub(-11, -1)) %>%
    left_join(by = c("tracts" = "fips"),
              y = read_rds("raw_data/covariates/svi_tract.rds"))
  
    
  # For each state, import zipcodes from TIGRIS
  zip <- tigris::zctas(state = MYSTATE, year = 2010) %>%
    st_as_sf()  %>%
    select(zipcode = ZCTA5CE10, geometry)
  
  # Join together the census tract measures into the zipcode
  zip %>%
    st_join(tracts) %>%
    as.data.frame() %>%
    group_by(zipcode) %>%
    summarize(svi = mean(svi, na.rm = TRUE),
              svi_socioeconomic = mean(svi_socioeconomic, na.rm = TRUE),
              svi_household_disability = mean(svi_household_disability, na.rm = TRUE),
              svi_minority = mean(svi_minority, na.rm = TRUE),
              svi_housing_tranport = mean(svi_housing_tranport, na.rm = TRUE)) %>%
    ungroup() %>%
    return()
}

data.frame(state = state.abb) %>%
  split(.$state) %>%
  map_dfr(~aggregate_zip(.$state), .id = "state") %>%
  saveRDS("raw_data/covariates/svi_zipcode.rds")
```


### County subdivision

```{r}
# Load packages
library(tidyverse)
library(tigris)
library(sf)
library(rgdal)

# Aggregate census tract measures to zipcode level
aggregate_csub = function(MYSTATE){
  
  print(MYSTATE)
  
  # Import 2018 SVI for given state
  tracts <- tigris::tracts(state = MYSTATE, year = 2010) %>%
    st_as_sf() %>%
    select(tracts = GEOID10, geometry) %>%
    left_join(by = c("tracts" = "fips"),
              y = read_rds("raw_data/covariates/svi_tract.rds"))
  
    
  # For each state, import zipcodes from TIGRIS
  csub <- tigris::county_subdivisions(state = MYSTATE, year = 2010) %>%
    st_as_sf() %>%
    select(csub = GEOID10, geometry)
  
  # Join together the census tract measures into the zipcode
  csub %>%
    st_join(tracts) %>%
    as.data.frame() %>%
    group_by(csub) %>%
    summarize(svi = mean(svi, na.rm = TRUE),
              svi_socioeconomic = mean(svi_socioeconomic, na.rm = TRUE),
              svi_household_disability = mean(svi_household_disability, na.rm = TRUE),
              svi_minority = mean(svi_minority, na.rm = TRUE),
              svi_housing_tranport = mean(svi_housing_tranport, na.rm = TRUE)) %>%
    ungroup() %>%
    return()
}

data.frame(state = state.abb) %>%
  split(.$state) %>%
  map_dfr(~aggregate_csub(.$state), .id = "state") %>%
  saveRDS("raw_data/covariates/svi_csub.rds")
```

## 3.2 COVID

### Louisiana

```{r}
# Let's import this data from the Louisiana Department of Health
#https://ldh.la.gov/coronavirus/
readxl::read_excel("raw_data/la_covid_testbyweek_tract_publicuse.xlsx") %>%
  magrittr::set_colnames(value = names(.) %>% tolower() %>% str_replace_all(" |[(]|[])]|-", "_")) %>%
  select(tract, date_for_end_of_week, 
         weekly_test_count, weekly_positive_test_count,
         weekly_case_count) %>%
  mutate(positivity_rate = weekly_positive_test_count / weekly_test_count * 100,
         cases_per_test = weekly_case_count / weekly_test_count) %>% 
  mutate_at(vars(positivity_rate, cases_per_test),
            funs(if_else(is.infinite(.), NA_real_, .))) %>%
  saveRDS("raw_data/la_covid.rds")
```

Let's aggregate these census tract results up to the county subdivision level, so we can test the effect of evacuation on these outcomes!

```{r}
# Take our county subdivisions
read_rds("shapes/csub.rds") %>% 
  # Zoom into Louisiana
  filter(str_sub(geoid, 1,2) == "22") %>%
  select(csub = geoid, name, geometry) %>%
  # Spatially join in relevant tracts
  st_join(read_rds("shapes/tracts.rds") %>%
    filter(str_sub(geoid, 1,2) == "22")) %>%
  # Join in our time-series COVID outcomes
  left_join(by = c("geoid" = "tract"), y = read_rds("raw_data/la_covid.rds")) %>%
  # Zoom into just county subdivisions/tracts with valid case rates
  filter(!is.na(cases_per_test)) %>%
  as.data.frame() %>%
  # Finally, let's aggregate up!
  group_by(csub, date_for_end_of_week) %>%
  summarize(positivity_rate = mean(positivity_rate, na.rm = TRUE),
            cases_per_test = mean(cases_per_test, na.rm = TRUE)) %>%
  ungroup()  %>%
# Finally, overwrite previous COVID dataset
  saveRDS("raw_data/la_covid.rds")

```

### California

```{r}
# Download the LA-Times' COVID dataset by place (actually zipcode) via Github
# https://github.com/datadesk/california-coronavirus-data/blob/master/latimes-place-totals.csv
read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv") %>%
  write_csv("raw_data/latimes_covid.csv")

# Let's also read the polygons for these estimates in,
# from the geojson file provided by the LATimes
# These are actually zipcodes
read_sf("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-polygons.geojson")  %>%
  saveRDS("shapes/latimes_places.rds")
```


```{r}
# Get Albers Equal Area Conic Projection
#https://spatialreference.org/ref/esri/north-america-albers-equal-area-conic/
aea <- "+proj=aea +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"

# For each zipcode polygon
zipcodes <- read_rds("shapes/latimes_places.rds") %>% 
              st_transform(crs = aea) %>%
              select(id, geometry)

# Import county subdivisions
csub <- read_rds("shapes/csub.rds") %>%
  # Filter to just those in california
  filter(str_sub(geoid, 1,2) %in% "06") %>%
  st_transform(crs = aea) %>%
  select(geoid, geometry)

# Create a crosswalk between zipcodes and county subdivisions
crosswalk <- zipcodes %>%
  # Join in county subdivisions
  st_join(csub) %>%
  # Focusing only on counties affected
  filter(!is.na(geoid)) %>%
  as.data.frame() %>%
  # Listing the zipcodes (id) within every subdivision (geoid)
  select(id, geoid) %>%
  distinct()

  # Join in the zipcode daily observations
read_csv("raw_data/latimes_covid.csv") %>% 
  select(id, date, confirmed_cases, population)  %>%
  #########################################################
  # 1. Convert from Cumulative Daily Cases to Daily Cases
  #########################################################
  arrange(id, date) %>%
  # For each zipcode,
  group_by(id) %>%
  # Calculate for each day how many more confirmed cases there are now
  # than there were one day before
  mutate(confirmed_cases = confirmed_cases - lag(confirmed_cases, 1)) %>%
  ungroup() %>%
  # If ever they report a negative number of daily cases, that is incorrect;
  # set it back to 0
  mutate(confirmed_cases = if_else(confirmed_cases < 0, 0, confirmed_cases)) %>%
  #########################################################
  # 2. Convert from Daily Cases to Cases in Week Prior
  #########################################################
  # Split each date into weeks
  mutate(week = lubridate::floor_date(date, unit = "weeks", week_start = 1)) %>%
  # Now for each date, identify the START DATE of the NEXT week,
  # so that we can tally how many cases each place saw in the week prior
  arrange(id, date) %>%
  group_by(id) %>%
  mutate(week = lead(week, 7)) %>%
  ungroup() %>%
  group_by(id, week) %>%
  summarize(confirmed_cases = sum(confirmed_cases, na.rm = TRUE),
            population = unique(population)) %>%
  ungroup() %>%
  #########################################################
  # 3. Convert from Cases in Week Prior to Case Rate
  #########################################################
  # Now filter to this date range
              filter(week >= "2020-08-01",
                     week <= "2020-12-31") %>%
  # Calculate the daily case rate per 100,000 people
  mutate(case_rate = confirmed_cases / population * 100000) %>%
  select(id, week, case_rate) %>%
  #########################################################
  # 4. Convert from Zipcode Level to County Subdivision Level
  #########################################################
  # Join in the county subdivision id
  left_join(by = "id", y = crosswalk) %>%
  # Aggregate by county subdivision, 
  # getting the average case rate per county subdivision
  group_by(geoid, week) %>%
  summarize(case_rate = mean(case_rate, na.rm = TRUE)) %>%
  ungroup() %>%
  # Fix infinites (which happen when you report a case in a zipcode with 0 population);
  # We will classify these as NA
  mutate(case_rate = if_else(is.infinite(case_rate), NA_real_, case_rate)) %>%
  # Filter out the NAs. We will not look at county subdivisions where no one lives
  filter(!is.na(case_rate)) %>%
  saveRDS("raw_data/ca_covid.rds")

remove(zipcode, csub, crosswalk)
```



## 3.3 Facebook Mobility

### Louisiana

```{r}
# 3. Data


intra_more <- read_rds("raw_data/hurricane_zeta_la.rds") %>%
  activate("edges") %>%
  # Zoom into movement between neighborhoods within the same subdivision
  filter(from == to) %>%
  filter(evacuation > 0) %>%
  mutate(from_csub = .N()$geoid[from],
         to_csub = .N()$geoid[to]) %>%
  as.data.frame() %>%
  # Calculate total evacuation within a subdivision
  group_by(csub = from_csub, date_time) %>%
  summarize(evacuation = sum(evacuation, na.rm = TRUE)) %>%
  ungroup()

# Calculate decrease in movement within subdivision
intra_less <- read_rds("raw_data/hurricane_zeta_la.rds") %>%
  activate("edges") %>%
  # Zoom into movement between neighborhoods within the same subdivision
  filter(from == to) %>%
  # Evaluate DECREASE in movement
  filter(evacuation < 0) %>%
  mutate(from_csub = .N()$geoid[from],
         to_csub = .N()$geoid[to]) %>%
  as.data.frame() %>%
  # Calculate total evacuation within a census csub
  group_by(csub = from_csub, date_time) %>%
  summarize(evacuation = sum(abs(evacuation), na.rm = TRUE)) %>%
  ungroup()

# Calculate increase in movement between census csub
inter_more <- read_rds("raw_data/hurricane_zeta_la.rds") %>%
  activate("edges") %>%
  # Zoom into movement between different census csub
  filter(from != to) %>%
  # Evaluate INCREASE in movement
  filter(evacuation > 0) %>%
  mutate(from_csub = .N()$geoid[from],
         to_csub = .N()$geoid[to]) %>%
  as.data.frame()
# Now calculate it for both sender and receivers
inter_more <- bind_rows(
  inter_more %>%
    group_by(csub = from_csub, date_time) %>%
    summarize(evacuation = sum(evacuation, na.rm = TRUE)) %>%
    ungroup(), 
  inter_more %>%
    group_by(csub = to_csub, date_time) %>%
    summarize(evacuation = sum(evacuation, na.rm = TRUE)) %>%
    ungroup()) %>%
  group_by(csub, date_time) %>%
  summarize(evacuation = sum(evacuation, na.rm = TRUE)) %>%
  ungroup()


# Calculate DECREASE in movement between census csub
inter_less <- read_rds("raw_data/hurricane_zeta_la.rds") %>%
  activate("edges") %>%
  # Zoom into edges between different census csub
  filter(from != to) %>%
  # Evaluate DECREASE in movement
  filter(evacuation < 0) %>%
  mutate(from_csub = .N()$geoid[from],
         to_csub = .N()$geoid[to]) %>%
  as.data.frame()
# Now calculate it for both sender and receivers
inter_less <- bind_rows(
  inter_less %>%
    group_by(csub = from_csub, date_time) %>%
    summarize(evacuation = sum(abs(evacuation), na.rm = TRUE)) %>%
    ungroup(), 
  inter_less %>%
    group_by(csub = to_csub, date_time) %>%
    summarize(evacuation = sum(abs(evacuation), na.rm = TRUE)) %>%
    ungroup()) %>%
  group_by(csub, date_time) %>%
  summarize(evacuation = sum(evacuation, na.rm = TRUE)) %>%
  ungroup()

# Bind these lists together into a single data.frame
bind_rows(
  intra_more, intra_less,
  inter_more, inter_less, .id = "type") %>%
  mutate(type = recode_factor(
    type,
    "1" = "evacuation_intra_more", "2" = "evacuation_intra_less", 
    "3" = "evacuation_inter_more", "4" = "evacuation_inter_less")) %>%
  pivot_wider(
    id_cols = c(csub, date_time),
    names_from = type,
    values_from = evacuation) %>%
  mutate_at(vars(matches("evacuation")), 
            funs(if_else(is.na(.), as.numeric(0), as.numeric(.)))) %>%
  mutate(evacuation_more = evacuation_intra_more + evacuation_inter_more,
         evacuation_less = evacuation_intra_less + evacuation_inter_less) %>%
  saveRDS("raw_data/hurricane_zeta_csub_movement.rds")



# Thanks to this helpful answer on Stack Overflow:
# https://stackoverflow.com/questions/57893554/get-next-wednesday-date-after-a-date-with-r
#This sets the start day of the week to be Wednesday (7 is the default, Sunday)
options(lubridate.week.start=3)
#This assigns each day in date_time to the next unit (Wednesday) above it and stores it

read_rds("raw_data/hurricane_zeta_csub_movement.rds") %>%
  mutate(week = lubridate::ceiling_date(date_time, unit = c('week'))) %>%
  group_by(csub, week) %>%
  summarize(evacuation_intra_more = sum(evacuation_intra_more, na.rm = TRUE),
            evacuation_intra_less = sum(evacuation_intra_less, na.rm = TRUE),
            evacuation_inter_more = sum(evacuation_inter_more, na.rm = TRUE),
            evacuation_inter_less = sum(evacuation_inter_less, na.rm = TRUE),
            evacuation_more = sum(evacuation_more, na.rm = TRUE),
            evacuation_less = sum(evacuation_less, na.rm = TRUE)) %>%
  ungroup() %>%
  saveRDS("raw_data/hurricane_zeta_csub_movement_weekly.rds")

#Restore default
options(lubridate.week.start=7)

rm(list=  ls())
```

### California

```{r}
# 3. Data

intra_more <- read_rds("raw_data/glass_fire_ca.rds") %>%
  activate("edges") %>%
  # Zoom into movement between neighborhoods within the same subdivision
  filter(from == to) %>%
  filter(evacuation > 0) %>%
  mutate(from_csub = .N()$geoid[from],
         to_csub = .N()$geoid[to]) %>%
  as.data.frame() %>%
  # Calculate total evacuation within a subdivision
  group_by(csub = from_csub, date_time) %>%
  summarize(evacuation = sum(evacuation, na.rm = TRUE)) %>%
  ungroup()

# Calculate decrease in movement within subdivision
intra_less <- read_rds("raw_data/glass_fire_ca.rds") %>%
  activate("edges") %>%
  # Zoom into movement between neighborhoods within the same subdivision
  filter(from == to) %>%
  # Evaluate DECREASE in movement
  filter(evacuation < 0) %>%
  mutate(from_csub = .N()$geoid[from],
         to_csub = .N()$geoid[to]) %>%
  as.data.frame() %>%
  # Calculate total evacuation within a census csub
  group_by(csub = from_csub, date_time) %>%
  summarize(evacuation = sum(abs(evacuation), na.rm = TRUE)) %>%
  ungroup()

# Calculate increase in movement between census csub
inter_more <- read_rds("raw_data/glass_fire_ca.rds") %>%
  activate("edges") %>%
  # Zoom into movement between different census csub
  filter(from != to) %>%
  # Evaluate INCREASE in movement
  filter(evacuation > 0) %>%
  mutate(from_csub = .N()$geoid[from],
         to_csub = .N()$geoid[to]) %>%
  as.data.frame()
# Now calculate it for both sender and receivers
inter_more <- bind_rows(
  inter_more %>%
    group_by(csub = from_csub, date_time) %>%
    summarize(evacuation = sum(evacuation, na.rm = TRUE)) %>%
    ungroup(), 
  inter_more %>%
    group_by(csub = to_csub, date_time) %>%
    summarize(evacuation = sum(evacuation, na.rm = TRUE)) %>%
    ungroup()) %>%
  group_by(csub, date_time) %>%
  summarize(evacuation = sum(evacuation, na.rm = TRUE)) %>%
  ungroup()


# Calculate DECREASE in movement between census csub
inter_less <- read_rds("raw_data/glass_fire_ca.rds") %>%
  activate("edges") %>%
  # Zoom into edges between different census csub
  filter(from != to) %>%
  # Evaluate DECREASE in movement
  filter(evacuation < 0) %>%
  mutate(from_csub = .N()$geoid[from],
         to_csub = .N()$geoid[to]) %>%
  as.data.frame()
# Now calculate it for both sender and receivers
inter_less <- bind_rows(
  inter_less %>%
    group_by(csub = from_csub, date_time) %>%
    summarize(evacuation = sum(abs(evacuation), na.rm = TRUE)) %>%
    ungroup(), 
  inter_less %>%
    group_by(csub = to_csub, date_time) %>%
    summarize(evacuation = sum(abs(evacuation), na.rm = TRUE)) %>%
    ungroup()) %>%
  group_by(csub, date_time) %>%
  summarize(evacuation = sum(evacuation, na.rm = TRUE)) %>%
  ungroup()

# Bind these lists together into a single data.frame
bind_rows(
  intra_more, intra_less,
  inter_more, inter_less, .id = "type") %>%
  mutate(type = recode_factor(
    type,
    "1" = "evacuation_intra_more", "2" = "evacuation_intra_less", 
    "3" = "evacuation_inter_more", "4" = "evacuation_inter_less")) %>%
  pivot_wider(
    id_cols = c(csub, date_time),
    names_from = type,
    values_from = evacuation) %>%
  mutate_at(vars(matches("evacuation")), 
            funs(if_else(is.na(.), as.numeric(0), as.numeric(.)))) %>%
  mutate(evacuation_more = evacuation_intra_more + evacuation_inter_more,
         evacuation_less = evacuation_intra_less + evacuation_inter_less) %>%
  saveRDS("raw_data/glass_fire_csub_movement.rds")



# Thanks to this helpful answer on Stack Overflow:
# https://stackoverflow.com/questions/57893554/get-next-wednesday-date-after-a-date-with-r
#This sets the start day of the week to be Monday (7 is the default, Sunday)
options(lubridate.week.start=1)
#This assigns each day in date_time to the next unit (Wednesday) above it and stores it

read_rds("raw_data/glass_fire_csub_movement.rds") %>%
  mutate(week = lubridate::ceiling_date(date_time, unit = c('week'))) %>%
  group_by(csub, week) %>%
  summarize(evacuation_intra_more = sum(evacuation_intra_more, na.rm = TRUE),
            evacuation_intra_less = sum(evacuation_intra_less, na.rm = TRUE),
            evacuation_inter_more = sum(evacuation_inter_more, na.rm = TRUE),
            evacuation_inter_less = sum(evacuation_inter_less, na.rm = TRUE),
            evacuation_more = sum(evacuation_more, na.rm = TRUE),
            evacuation_less = sum(evacuation_less, na.rm = TRUE)) %>%
  ungroup() %>%
  saveRDS("raw_data/glass_fire_csub_movement_weekly.rds")

#Restore default
options(lubridate.week.start=7)
rm(list=  ls())
```

## 3.4 Rainfall

Next, we're going to approximate how much rain fell in each census tract every week during our study period.

NOAA has an excellent dataset hosted by the Global Historical Climatology Network (https://www.ncdc.noaa.gov/data-access/land-based-station-data/land-based-datasets/global-historical-climatology-network-ghcn). GHCN "is an integrated database of climate summaries from land surface stations across the globe that have been subjected to a common suite of quality assurance reviews. The data are obtained from more than 20 sources. Some data are more than 175 years old while others are less than an hour old." They host data on an FTP server that contains daily precipitation, broken into yearly datasets (ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/). A README file is available here (https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/readme.txt). The locations of recording stations is available here (https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt), and we can use these to locate and spatially interpolate these rainfall levels.

### Get data

```{r}
# First, let's make a rainfall folder
dir.create("raw_data/covariates/rainfall")

# Get Albers Equal Area Conic Projection
#https://spatialreference.org/ref/esri/north-america-albers-equal-area-conic/
aea <- "+proj=aea +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"



# File readme available here:
# https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/readme.txt

# Station information
# https://catalog.data.gov/dataset/global-historical-climatology-network-daily-ghcn-daily-version-3/resource/bbd84e54-5a2e-4b2b-bedf-e6611b5113d7
# https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt

read_delim(
  file = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt", 
  delim = " ", 
  col_names = FALSE) %>%
  dplyr::select(station_id = 1, latitude = 2, longitude = 3) %>%
  mutate_at(vars(latitude, longitude), funs(. %>% as.numeric)) %>%
  # Convert to an SF object with the following coordinates
  st_as_sf(coords = c("longitude", "latitude"), crs = CRS(paste0("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"))) %>%
  saveRDS("raw_data/covariates/rainfall/us_stations.rds")


# Now download 2020 rainfall data
# We're going to have to download it into a different directory, since it is HUGE.
download.file(
  url = "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2020.csv.gz",
    destfile = "raw_data/covariates/rainfall/us_rainfall_2020.csv.gz")


# Write decompressed data
# (Note: This requires a LOT of computational power. 
# I had to scale up my cloud server from 2 GB of RAM 
# to 8 GB of RAM to pull this off.)
read_csv("raw_data/covariates/rainfall/us_rainfall_2020.csv.gz", 
         col_names = FALSE) %>% 
  # Grab just these vectors
  dplyr::select(station_id = 1, date = 2, type = 3, value = 4) %>%
  # Grab entries from August to December
  filter(str_sub(as.character(date), 5,6) %in% c("08", "09", "10", "11", "12")) %>%
  # Filter to just precipitation
  filter(type == "PRCP") %>%
  # Convert dates to appropriate format
    mutate(date = lubridate::make_date(
    year = str_sub(date, 1,4),
    month = str_sub(date, 5,6),
    day = str_sub(date, 7,8))) %>%
  # save to file
  saveRDS("raw_data/covariates/rainfall/us_rainfall_2020.rds")

# Now remove raw US rainfall
unlink("raw_data/covariates/rainfall/us_rainfall_2020.csv.gz")
```

### Get Subset of Stations

```{r}
## For the US, let's use the Contiguous US Albers Equal Area Conic Projection
#https://spatialreference.org/ref/esri/north-america-albers-equal-area-conic/
aea <- "+proj=aea +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"


counties <- read_rds("raw_data/hurricane_zeta_counties.rds")
# Import census tracts
#tracts <- read_rds("shapes/tracts.rds") %>%
#  filter(str_sub(geoid, 1,5) %in% counties) %>%
#  st_transform(crs = aea)
csub <- read_rds("shapes/csub.rds") %>%
  filter(str_sub(geoid, 1,2) %in% "22") %>%
  st_transform(crs = aea)


ggplot() +
  geom_sf(data = csub) +
  geom_sf(data = read_rds("raw_data/covariates/rainfall/us_stations.rds") %>%
  sample_n(size = 2000), color = "red") +
  theme_void()

read_rds("raw_data/covariates/rainfall/us_stations.rds") %>%
  st_transform(crs = aea) %>%
  #st_transform(crs = 4326) %>%
  #st_set_crs(aea) %>%
#  st_set_crs(aea) %>% 
#  sample_n(size = 2000) %>%
  # Now join the codes of county subdivisions in the study area
  st_join(csub) %>%
  # Filter to stations in the study area
  filter(!is.na(geoid)) %>%
  # save to file
  saveRDS("raw_data/covariates/rainfall/la_stations.rds")

# These are the locations of weather stations
ggplot() +
  geom_sf(data = csub, mapping = aes(geometry=geometry), 
          color = "darkgrey", fill = "black", size = 5) +
  geom_sf(data = csub, mapping = aes(geometry=geometry), 
          color = "grey", fill = "black", size = 0.1) +
  geom_sf(data = read_rds("raw_data/covariates/rainfall/la_stations.rds"), 
          mapping = aes(geometry=geometry), 
          color = "lightblue", size = 2, alpha = 0.75) +
  theme_void(base_size = 14) +
  theme(plot.subtitle = element_text(hjust = 0.5)) +
  labs(subtitle = "Land Surface Rainfall Data Stations in Louisiana\n(n = 727 stations)") +
  ggsave("viz/la_rainfall_stations.png", dpi = 300)
```

### Interpolate

```{r}
## For the US, let's use the Contiguous US Albers Equal Area Conic Projection
#https://spatialreference.org/ref/esri/north-america-albers-equal-area-conic/
aea <- "+proj=aea +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"

#counties <- read_rds("raw_data/hurricane_zeta_counties.rds")

# Import census tracts
csub <- read_rds("shapes/csub.rds") %>%
  # Zoom into just Louisiana
  filter(str_sub(geoid, 1,2) %in% "22") %>%
  st_transform(crs = aea)

# The shapefile is measured in meters,
# so when we make the fishnet cellsize 7500,
# that means 15,000 m (15 km).

csub %>% 
  st_make_grid(cellsize = 15000, what = "polygons") %>%
  st_as_sf() %>%
  mutate(id = 1:n()) %>%
  select(id, geometry = x) %>%
  saveRDS("raw_data/covariates/rainfall/la_fishnet.rds")

# Visualize fishnet grid here
ggplot() +
  # Plot background
  geom_sf(data = csub, fill = "black", color = NA) +
  # Plot fishnet
  geom_sf(data = read_rds("raw_data/covariates/rainfall/la_fishnet.rds"), 
          color = "white", size = 0.1, fill = NA) +
  # Plot station locations
  geom_sf(data = read_rds("raw_data/covariates/rainfall/la_stations.rds"),
          mapping = aes(geometry=geometry), 
          color = "cyan", size = 0.5) +
  theme_void()
```



```{r, message=FALSE, warning = FALSE}

#This sets the start day of the week to be Wednesday (7 is the default, Sunday)
options(lubridate.week.start=3)
#This assigns each day in date_time to the next unit (Wednesday) above it and stores it

# Import rainfall data
rain <- read_rds("raw_data/covariates/rainfall/la_stations.rds") %>%
  left_join(by = "station_id", 
            y = read_rds("raw_data/covariates/rainfall/us_rainfall_2020.rds") %>% 
              # Now filter to just rain measurements from these sites
              filter(station_id %in% read_rds("raw_data/covariates/rainfall/la_stations.rds")$station_id) %>%
              filter(type == "PRCP") %>%
              # Aggregate by week
              mutate(week = lubridate::ceiling_date(date, unit = c('week'))) %>%
              group_by(station_id, week) %>%
              summarize(rain = sum(value, na.rm = TRUE))) %>%
  filter(!is.na(rain))


# Import fishnet grid
nodes <- read_rds("raw_data/covariates/rainfall/la_fishnet.rds")

interpolate = function(DATE){
  print(DATE)
  # Make a grid of shapes from the polygon
  m <- gstat::gstat(id = "id", formula = rain ~ 1, 
                    data = rain %>%
                      filter(week == DATE) %>%
                      na.omit() %>%
                      as("Spatial"),
                    set = list(idp = 2))
  # Generate predictions
  predict(m, nodes) %>%
    dplyr::select(predicted = id.pred, geometry) %>%
    # Add in a unique id from the original grid cells
    mutate(id = nodes$id) %>%
    mutate(week = DATE) %>%
    return()
  
}

data.frame(week = rain$week %>% unique()) %>%
  split(.$week) %>%
  map(~interpolate(.$week), .id = "week") %>%
  dplyr::bind_rows() %>%
  saveRDS("raw_data/covariates/rainfall/la_rainfall_predicted.rds")
```

```{r}
grid <- read_rds("raw_data/covariates/rainfall/la_rainfall_predicted.rds") %>%
  st_as_sf() %>%
  # Give each their unique ID (since each prediction was made in order of the original nodes dataset)
  group_by(week) %>%
  mutate(id = 1:n()) %>%
  ungroup() %>%
  as.data.frame() %>%
  select(-geometry)

# Import fishnet grid
nodes <- read_rds("raw_data/covariates/rainfall/la_fishnet.rds") %>%
  left_join(by = "id", y = grid)

remove(grid)

# Average predictions among county subdivisions
csub %>%
  st_join(nodes) %>%
  as.data.frame() %>%
  select(-geometry) %>%
  group_by(geoid, week) %>%
  summarize(avg_rainfall = mean(predicted, na.rm = TRUE)) %>%
  ungroup() %>%
  saveRDS("raw_data/covariates/rainfall/la_rainfall_csub.rds")

# Join rainfall into tracts
rain <- csub %>%
  left_join(by = "geoid", 
            y = read_rds("raw_data/covariates/rainfall/la_rainfall_csub.rds")) 

# Finally, let's visualze this pattern of weekly rainfall, a useful way of adjusting for exposure and damage to multiple storms during this hurricane season.
ggplot() +
  geom_sf(data = rain %>%
            filter(week < "2020-12-24"),
          # It's measured in tenths of a millimeter, so let's convert to just mm
          mapping = aes(fill = avg_rainfall / 10), color = NA) +
  theme_void() +
  scale_fill_viridis(option = "plasma") +
  facet_wrap(~week, ncol = 7) +
  theme(plot.caption = element_text(hjust = 0.5),
        legend.position = "bottom") +
  guides(fill = guide_colorbar(barwidth = 10, barheight = 0.5)) +
  labs(fill = "Weekly\nRainfall (mm)",
       caption = "Spatially interpolated across 15 km grid cells and averaged by census tract.") +
  ggsave("viz/rainfall_csub.png", dpi = 500, width = 8, height = 5)
```

```{r}
remove(m, nodes, pred, shapes, rain, csub)

```

## 3.5 Burn Area

```{r}
# Import counties
mycounties <- read_rds("shapes/counties.rds") %>% 
  filter(str_sub(geoid, 1,2) %in% "06") %>%
  as.data.frame() %>%
  select(geoid, name, area_land_county = area_land) %>%
  # Convert area from square meters to acres
  mutate(area_land_county = area_land_county * 0.000247105)

make_grid = function(mydata){
  data.frame(
    county = mydata$county,
    acres_burned = mydata$acres_burned,
    date = seq(from = mydata$start, to = mydata$end, by = "day")) %>%
    return()
}


# Download California fires dataset
read_csv("https://docs.google.com/spreadsheets/d/1UgRFtuwW2xguQbQn_tj9xpFbgY7v6HyW8SZSP1z0SDw/export?format=csv") %>%
  saveRDS("raw_data/ca_fires.rds")

# Estimate burn area per day
fire <- read_rds("raw_data/ca_fires.rds") %>%
  # Get a unique id for each fire
  mutate(id = 1:n()) %>%
  # Split county column into one cell per county
  separate(col = county, into = paste("county", 1:9, sep = "_"), sep = ",") %>%
  # then flip into a long, tidy data.frame
  pivot_longer(cols = contains("county"), names_to = "group", values_to = "county") %>%
  # eliminating empty rows
  filter(!is.na(county)) %>%
  mutate(county = str_trim(county, side = "both")) %>%
  # and converting the date to date format
  mutate_at(vars(start,end),funs(lubridate::mdy(.))) %>%
  # calculate length of interval in days
  mutate(length = lubridate::interval(start, end) / lubridate::duration(num = 1, units = "days")) %>%
  # calculate number of counties affected per fire,
  # and normalized the burn area by the number of counties it totals
  # and divide it by the number of days that fire ran
  group_by(id, name) %>%
  mutate(n_counties = n(),
         acres_burned = acres_burned / n_counties / length) %>%
  ungroup() %>%
  distinct() %>%
  select(id, county, acres_burned, start, end) %>%
  # Build out a grid of county-days
  mutate(pair = paste(id, county, sep = "-")) %>%
  split(.$pair) %>%
  map_dfr(~make_grid(.)) %>% 
  # Some counties experienced multiple fires.
  # calculate the total acres burned, on average, per day
  group_by(county, date) %>%
  summarize(acres_burned = sum(acres_burned, na.rm = TRUE)) %>%
  ungroup()  %>%
  # Join in total land area of each county
  left_join(by = c("county" = "name"), y = mycounties) %>%
  # For each county, calculate the average number of acres burned...
  # per day, given the number of days the fire transpired,
  # per acre of total land in that county 
  saveRDS("raw_data/ca_burn_area.rds")

# https://stackoverflow.com/questions/57893554/get-next-wednesday-date-after-a-date-with-r
#This sets the start day of the week to be Monday (7 is the default, Sunday)
options(lubridate.week.start=1)
#This assigns each day in date_time to the next unit (Wednesday) above it and stores it

# Calculate weekly burn rate
read_rds("raw_data/ca_burn_area.rds") %>% 
  mutate(week = lubridate::ceiling_date(date, unit = c('week'))) %>%
  # By totalling up the approximate amount of acres burned per week
  group_by(geoid, week) %>%
  summarize(acres_burned = sum(acres_burned, na.rm = TRUE),
            area_land_county = unique(area_land_county)) %>%
  ungroup() %>%
  # And then calculating the percentage burned out of all land in the county
  mutate(burn_rate = acres_burned / area_land_county * 100) %>%
  select(geoid, week, burn_rate) %>%
  # if the burn rate is non-zero, mark as an active fire
  mutate(active_fire = if_else(burn_rate > 0, 1, 0)) %>%
  saveRDS("raw_data/ca_burn_rate.rds")

#Restore default
options(lubridate.week.start=7)
rm(list=  ls())
```


## 3.6 Join Subdivision Dataset

### Louisiana

```{r, message = FALSE, warning = FALSE}
# Let's make a grid
expand_grid(
  csub = read_rds("raw_data/la_covid.rds")$csub %>% unique(),
            date_for_end_of_week = read_rds("raw_data/la_covid.rds")$date_for_end_of_week %>% unique()) %>%
  # Join in covid data
  left_join(by = c("csub", "date_for_end_of_week"),
            y = read_rds("raw_data/la_covid.rds")) %>%
  # If any case rates are missing, that's because there were 
  # no cases identified in that subdivision during that week.
  mutate(positivity_rate = if_else(is.na(positivity_rate), 0, as.numeric(positivity_rate)),
         cases_per_test =  if_else(is.na(cases_per_test), 0, as.numeric(cases_per_test))) %>%
  ################################################
  # Join in key predictors
  ################################################
  # Join in Social Capital Indices at subdivision Level
  left_join(by = c("csub" = "csub"), 
            y = read_csv("raw_data/covariates/sci_county_subdivisions_2021_01_28.csv") %>%
              filter(state == "LA", year == 2018) %>%
              select(csub, social_capital, bonding, bridging, linking)) %>%
  # Join in Social Vulnerability Indices at subdivision Level
  left_join(by = c("csub" = "csub"), 
            y = read_rds("raw_data/covariates/svi_csub.rds")) %>% 
  # Let's join in county traits using a county variable
  mutate(county = str_sub(csub, 1,5)) %>%
  left_join(by = c("county" = "fips"), 
            y = read_rds("raw_data/covariates/covariates_county.rds") %>% 
              select(-state, -county)) %>%
  # Let's also join in Facebook mobility
  left_join(y = read_rds("raw_data/hurricane_zeta_csub_movement_weekly.rds"),
            by = c("csub", "date_for_end_of_week" = "week")) %>%
  # Fill in blanks here with zeros
  mutate_at(vars(contains("evacuation")), 
            funs(if_else(is.na(.), 0, .))) %>%
  # Let's also join in county mobility by Google
  left_join(by = c("date_for_end_of_week" = "date", "county" = "fips"),
            y = read_rds("raw_data/covariates/average_google_mobility.rds")) %>%
  # Also, join in weekly rainfall
  left_join(by = c("date_for_end_of_week" = "week", "csub" = "geoid"),
            y = read_rds("raw_data/covariates/rainfall/la_rainfall_csub.rds")) %>%
  rename(week = date_for_end_of_week) %>%
    mutate(pop_density = pop / area_land) %>%
  ################################################
  # Design - lags and sample region
  ################################################
   # Zoom into just the study region
  filter(county %in% read_rds("raw_data/hurricane_zeta_counties.rds")) %>%
  # Please BRING FORWARD all evacuation measures by TWO weeks,
  # so that the evacuation from TWO WEEKS AGO now corresponds 
  # to the COVID outcomes of THIS WEEK.
  arrange(csub, desc(week)) %>%
  group_by(csub) %>%
  mutate_at(vars(contains("evacuation"), avg_rainfall),
            funs(lead(., 2))) %>%
  ungroup() %>%
  
  # Let's fill in the new missing spots created by lagging with zeros - 
  # this is fine because the evacuation and Zeta, which 
  # these variables are measuring don't happen for months, 
  # so their value should be zero at the beginning
  mutate_at(vars(contains("evacuation")),
            funs(if_else(is.na(.), 0, as.numeric(.)))) %>%
  # Start in August
  mutate(week = as.Date(week)) %>%
  filter(week > "2020-08-01") %>%
  # Cut off before Holidays
  filter(week < "2020-12-24") %>%
  
  ################################################
  # Extra Variables and Mean Imputation
  ################################################
    # Add a weekly counter since start of period
  mutate(counter = lubridate::interval("2020-08-01", week) / lubridate::duration(num = 1, units = "weeks")) %>%
  # Adjust variables
  # Control for population among evacuation tallies 
  mutate_at(vars(contains("evacuation")),
            funs(. / pop * 1000)) %>%
  # SVI is missing 12 data points - just one tract. Fill it in, using county mean.
  group_by(county = str_sub(csub, 1,5)) %>%
  mutate_at(vars(contains("svi")), funs(if_else(is.na(.), mean(., na.rm = TRUE), .))) %>%
  # Workplaces is missing 152, from the same few places
  mutate(workplaces = if_else(
    condition = is.na(workplaces), 
    true = mean(workplaces, na.rm = TRUE),
    false = workplaces)) %>%
  ungroup() %>%
  
  saveRDS("raw_data/la_dataset.rds")

```

### California

```{r, message = FALSE, warning = FALSE}

# Let's make a grid
expand_grid(
  csub = read_rds("raw_data/ca_covid.rds")$geoid %>% unique(),
            date_for_end_of_week = read_rds("raw_data/ca_covid.rds")$week %>% unique()) %>%
  # Join in covid data
  left_join(by = c("csub" = "geoid", "date_for_end_of_week" = "week"),
            y = read_rds("raw_data/ca_covid.rds")) %>%
  # If any case rates are missing, that's because there were 
  # no cases identified in that subdivision during that week.
  mutate(case_rate = if_else(is.na(case_rate), 0, as.numeric(case_rate))) %>%
  ################################################
  # Join in key predictors
  ################################################
  # Join in Social Capital Indices at subdivision Level
  left_join(by = c("csub" = "csub"), 
            y = read_csv("raw_data/covariates/sci_county_subdivisions_2021_01_28.csv") %>%
              filter(state == "CA", year == 2018) %>%
              select(csub, social_capital, bonding, bridging, linking)) %>%
  # Join in Social Vulnerability Indices at subdivision Level
  left_join(by = c("csub" = "csub"), 
            y = read_rds("raw_data/covariates/svi_csub.rds")) %>% 
  # Let's join in county traits using a county variable
  mutate(county = str_sub(csub, 1,5)) %>%
  left_join(by = c("county" = "fips"), 
            y = read_rds("raw_data/covariates/covariates_county.rds") %>% 
              select(-state, -county)) %>%
  # Let's also join in Facebook mobility
  left_join(y = read_rds("raw_data/glass_fire_csub_movement_weekly.rds"),
            by = c("csub", "date_for_end_of_week" = "week")) %>%
  # Fill in blanks here with zeros
  mutate_at(vars(contains("evacuation")), 
            funs(if_else(is.na(.) | is.infinite(.), 0, .))) %>%
  # Let's also join in county mobility by Google
  left_join(by = c("date_for_end_of_week" = "date", "county" = "fips"),
            y = read_rds("raw_data/covariates/average_google_mobility.rds")) %>%
  rename(week = date_for_end_of_week) %>%
  # Join in a weekly burn rate and identifier for active fires
  left_join(by = c("week" = "week", "county" = "geoid"),
            y = read_rds("raw_data/ca_burn_rate.rds")) %>%
  # also calculate population density
  mutate(pop_density = pop / area_land) %>%
    ################################################
  # Design - lags and sample region
  ################################################
   # Zoom into just the study region
  filter(county %in% read_rds("raw_data/glass_fire_counties.rds")) %>%
  # Please BRING FORWARD all evacuation measures by TWO weeks,
  # so that the evacuation from TWO WEEKS AGO now corresponds 
  # to the COVID outcomes of THIS WEEK.
  arrange(csub, desc(week)) %>%
  group_by(csub) %>%
  mutate_at(vars(contains("evacuation"), active_fire, burn_rate),
            funs(lead(., 2))) %>%
  # Let's fill in the new missing spots created by lagging with zeros - 
  # this is fine because the evacuation and active fires that 
  # these variables are measuring don't happen for months, 
  # so their value should be zero at the beginning
  mutate_at(vars(contains("evacuation"), active_fire, burn_rate),
            funs(if_else(is.na(.), 0, as.numeric(.)))) %>%
  ungroup() %>%
  # Start in August
  mutate(week = as.Date(week)) %>%
  filter(week > "2020-08-01") %>%
  # Cut off before Holidays
  filter(week < "2020-12-24") %>%
  ################################################
  # Extra Variables and Mean Imputation
  ################################################
    # Add a weekly counter since start of period
  mutate(counter = lubridate::interval("2020-08-01", week) / lubridate::duration(num = 1, units = "weeks")) %>%
  # Adjust variables
  # Control for population among evacuation tallies 
  mutate_at(vars(contains("evacuation")),
            funs(. / pop * 1000)) %>%
  group_by(county = str_sub(csub, 1,5)) %>%
  # Workplaces is missing 25, from the same few places, fill in with county mean
  mutate(workplaces = if_else(
    condition = is.na(workplaces), 
    true = mean(workplaces, na.rm = TRUE),
    false = workplaces)) %>%
  ungroup() %>%
  saveRDS("raw_data/ca_dataset.rds")
```

## 3.7 Covariate Interpolation

Finally, these datasets are quite large, but not quite large enough to handle nested random effects between county subdivisions AND counties. Instead, we are going to spatially interpolate the county level covariates so that they better estimate variation over space.

## California

```{r}
# Make a folder for covariate interpolation
dir.create("raw_data/interpolate")

# Get Albers Equal Area Conic Projection
#https://spatialreference.org/ref/esri/north-america-albers-equal-area-conic/
aea <- "+proj=aea +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"


# The shapefile is measured in meters,
# so when we make the fishnet cellsize 7500,
# that means 15,000 m (15 km).

# Generate a fishnet grid for the fires
read_rds("shapes/csub.rds") %>%
  filter(str_sub(geoid, 1,5) %in% read_rds("raw_data/glass_fire_counties.rds")) %>%
  st_transform(crs = aea) %>%
  st_make_grid(cellsize = 15000, what = "polygons") %>%
  st_as_sf() %>%
  mutate(id = 1:n()) %>%
  select(id, geometry = x) %>%
  saveRDS("raw_data/interpolate/ca_fishnet.rds")

# Import burn rate data
shapes <- read_rds("shapes/csub.rds") %>%
  filter(str_sub(geoid, 1,5) %in% read_rds("raw_data/glass_fire_counties.rds")) %>%
  st_transform(crs = aea) %>%
  left_join(by = c("geoid" = "csub"), 
            y = read_rds("raw_data/ca_dataset.rds") %>%
              select(csub, week, burn_rate))

#read_rds("ca_dataset.rds") %>% 
#  select(csub, week, burn_rate, health_care_capacity, health_quality, 
#    employee_muni, democrat_2016)

# Import fishnet grid
nodes <- read_rds("raw_data/interpolate/ca_fishnet.rds")

interpolate_fire = function(WEEK){
  print(WEEK)
  # Make a grid of shapes from the polygon
  gstat::gstat(id = "id", formula = burn_rate ~ 1, 
               data = shapes %>%
                 filter(week == WEEK) %>%
                 na.omit() %>%
                 as("Spatial"),
               set = list(idp = 2)) %>%
    # Generate predictions
    predict(nodes) %>%
    dplyr::select(predicted = id.pred, geometry) %>%
    # Add in a unique id from the original grid cells
    mutate(id = nodes$id) %>%
    mutate(week = WEEK) %>%
    return()
}


data.frame(week = shapes$week %>% unique()) %>%
  split(.$week) %>%
  map(~interpolate_fire(.$week), .id = "week") %>%
  dplyr::bind_rows() %>%
  saveRDS("raw_data/interpolate/ca_burn_rate.rds")


# Repeat for workplace mobility
shapes <- read_rds("shapes/csub.rds") %>%
  filter(str_sub(geoid, 1,5) %in% read_rds("raw_data/glass_fire_counties.rds")) %>%
  st_transform(crs = aea) %>%
  left_join(by = c("geoid" = "csub"), 
            y = read_rds("raw_data/ca_dataset.rds") %>%
              select(csub, week, workplaces))

interpolate_workplaces = function(WEEK){
  print(WEEK)
  # Make a grid of shapes from the polygon
  gstat::gstat(id = "id", formula = workplaces ~ 1, 
               data = shapes %>%
                 filter(week == WEEK) %>%
                 na.omit() %>%
                 as("Spatial"),
               set = list(idp = 2)) %>%
    # Generate predictions
    predict(nodes) %>%
    dplyr::select(predicted = id.pred, geometry) %>%
    # Add in a unique id from the original grid cells
    mutate(id = nodes$id) %>%
    mutate(week = WEEK) %>%
    return()
}

data.frame(week = shapes$week %>% unique()) %>%
  split(.$week) %>%
  map(~interpolate_workplaces(.$week), .id = "week") %>%
  dplyr::bind_rows() %>%
  saveRDS("raw_data/interpolate/ca_workplaces.rds")




# Import ordinary time-invariant covariate data
shapes <- read_rds("shapes/csub.rds") %>%
  filter(str_sub(geoid, 1,5) %in% read_rds("raw_data/glass_fire_counties.rds")) %>%
  st_transform(crs = aea) %>%
  left_join(by = c("geoid" = "csub"), 
            y = read_rds("raw_data/ca_dataset.rds") %>%
              select(csub, health_care_capacity, health_quality, 
                     employee_muni, democrat_2016) %>%
              distinct())


# Next, let's write a quick function to get time-invariant traits too.
interpolate_trait = function(VARNAME){
  print(VARNAME)
  # Make a grid of shapes from the polygon
  gstat::gstat(id = "id", formula = variable ~ 1, 
               data = shapes %>%
                 rename(variable = VARNAME) %>%
                 na.omit() %>%
                 as("Spatial"),
               set = list(idp = 2)) %>%
    # Generate predictions
    predict(nodes) %>%
    dplyr::select(predicted = id.pred, geometry) %>%
    # Add in a unique id from the original grid cells
    mutate(id = nodes$id) %>%
    mutate(variable = VARNAME) %>%
    return()
}

data.frame(variable = c("health_care_capacity", "health_quality", 
                     "employee_muni", "democrat_2016")) %>%
  split(.$variable) %>%
  map(~interpolate_trait(.$variable), .id = "variable") %>%
  dplyr::bind_rows() %>%
  saveRDS("raw_data/interpolate/ca_covariates.rds")


# Take original county subdivision polygons
dat <- shapes %>% 
  select(geoid, geometry) %>%
  st_transform(crs= aea) %>%
  # Spatially join in grid cells
  st_join(read_rds("raw_data/interpolate/ca_fishnet.rds") %>%
            st_transform(crs = aea)) %>%
  # Left-join data
  left_join(by = c("id"),
            y = read_rds("raw_data/interpolate/ca_burn_rate.rds") %>%
            as.data.frame() %>%
            select(id, week, burn_rate = predicted)) %>%
  left_join(by = c("id", "week"), 
            y = read_rds("raw_data/interpolate/ca_workplaces.rds") %>%
            as.data.frame() %>%
            select(id, week, workplaces = predicted)) %>%
  left_join(by = "id",
            y = read_rds("raw_data/interpolate/ca_covariates.rds") %>%
                        as.data.frame() %>%
                        select(-geometry) %>%
                        pivot_wider(id_cols = id, 
                                    names_from = variable, 
                                    values_from = predicted)) %>%
  as.data.frame() %>%
  # Average across grid cells
  group_by(geoid, week) %>%
  summarize_at(vars(burn_rate, workplaces, 
                    health_care_capacity, health_quality, 
                     employee_muni, democrat_2016),
               funs(mean(., na.rm = TRUE))) %>%
  ungroup() %>%
  rename(burn_rate_int = burn_rate,
         workplaces_int = workplaces,
         health_care_capacity_int = health_care_capacity,
         health_quality_int = health_quality,
         employee_muni_int = employee_muni,
         democrat_2016_int = democrat_2016)

# Update california dataset with interpolated covariates
read_rds("raw_data/ca_dataset.rds") %>%
  left_join(by = c("csub" = "geoid", "week"), y = dat) %>%
  saveRDS("raw_data/ca_dataset.rds")


# To deal with lagged data, filter out first two weeks of august
read_rds("raw_data/ca_dataset.rds") %>%
  # Also, create a lagged case rate
  group_by(csub) %>%
  mutate(case_rate_lag2 = lead(case_rate, 2)) %>%
  ungroup() %>%
  filter(week >= "2020-08-17") %>%
      # Join in weather data  
  left_join(by = c("csub" = "geoid",  "week"),
          y = read_csv("raw_data/temperature/weather_weekly.csv")) %>%
  saveRDS("raw_data/ca_dataset.rds")

rm(list = ls())

# Now get the data for sensitivity tests!
read_rds("raw_data/ca_dataset.rds") %>%
  # Also, create a lagged case rate
  group_by(csub) %>%
  mutate(rate1 = lead(case_rate, 1),
         rate3 = lag(case_rate, 1),
         rate4 = lag(case_rate, 2),
         rate5 = lag(case_rate, 3)) %>%
  ungroup() %>%
 saveRDS("raw_data/ca_dataset.rds")

```

## Louisiana

```{r}


# Get Albers Equal Area Conic Projection
#https://spatialreference.org/ref/esri/north-america-albers-equal-area-conic/
aea <- "+proj=aea +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"


# Generate another fishnet grid for Louisiana
read_rds("shapes/csub.rds") %>%
  filter(str_sub(geoid, 1,5) %in% read_rds("raw_data/hurricane_zeta_counties.rds")) %>%
  st_transform(crs = aea) %>%
  st_make_grid(cellsize = 15000, what = "polygons") %>%
  st_as_sf() %>%
  mutate(id = 1:n()) %>%
  select(id, geometry = x) %>%
  saveRDS("raw_data/interpolate/la_fishnet.rds")


# Import fishnet grid
nodes <- read_rds("raw_data/interpolate/la_fishnet.rds")


# Repeat for workplace mobility
shapes <- read_rds("shapes/csub.rds") %>%
  filter(str_sub(geoid, 1,5) %in% read_rds("raw_data/hurricane_zeta_counties.rds")) %>%
  st_transform(crs = aea) %>%
  left_join(by = c("geoid" = "csub"), 
            y = read_rds("raw_data/la_dataset.rds") %>%
              select(csub, week, workplaces))

interpolate_workplaces = function(WEEK){
  print(WEEK)
  # Make a grid of shapes from the polygon
  gstat::gstat(id = "id", formula = workplaces ~ 1, 
               data = shapes %>%
                 filter(week == WEEK) %>%
                 na.omit() %>%
                 as("Spatial"),
               set = list(idp = 2)) %>%
    # Generate predictions
    predict(nodes) %>%
    dplyr::select(predicted = id.pred, geometry) %>%
    # Add in a unique id from the original grid cells
    mutate(id = nodes$id) %>%
    mutate(week = WEEK) %>%
    return()
}

data.frame(week = shapes$week %>% unique()) %>%
  split(.$week) %>%
  map(~interpolate_workplaces(.$week), .id = "week") %>%
  dplyr::bind_rows() %>%
  saveRDS("raw_data/interpolate/la_workplaces.rds")



# Import ordinary time-invariant covariate data
shapes <- read_rds("shapes/csub.rds") %>%
  filter(str_sub(geoid, 1,5) %in% read_rds("raw_data/hurricane_zeta_counties.rds")) %>%
  st_transform(crs = aea) %>%
  left_join(by = c("geoid" = "csub"), 
            y = read_rds("raw_data/la_dataset.rds") %>%
              select(csub, health_care_capacity, health_quality, 
                     employee_muni, democrat_2016) %>%
              distinct())

# Next, let's write a quick function to get time-invariant traits too.
interpolate_trait = function(VARNAME){
  print(VARNAME)
  # Make a grid of shapes from the polygon
  gstat::gstat(id = "id", formula = variable ~ 1, 
               data = shapes %>%
                 rename(variable = VARNAME) %>%
                 na.omit() %>%
                 as("Spatial"),
               set = list(idp = 2)) %>%
    # Generate predictions
    predict(nodes) %>%
    dplyr::select(predicted = id.pred, geometry) %>%
    # Add in a unique id from the original grid cells
    mutate(id = nodes$id) %>%
    mutate(variable = VARNAME) %>%
    return()
}

data.frame(variable = c("health_care_capacity", "health_quality", 
                     "employee_muni", "democrat_2016")) %>%
  split(.$variable) %>%
  map(~interpolate_trait(.$variable), .id = "variable") %>%
  dplyr::bind_rows() %>%
  saveRDS("raw_data/interpolate/la_covariates.rds")



# Take original county subdivision polygons
dat <- shapes %>% 
  select(geoid, geometry) %>%
  st_transform(crs= aea) %>%
  # Spatially join in grid cells
  st_join(read_rds("raw_data/interpolate/la_fishnet.rds") %>%
            st_transform(crs = aea)) %>%
  # Left-join data
  left_join(by = c("id"), 
            y = read_rds("raw_data/interpolate/la_workplaces.rds") %>%
            as.data.frame() %>%
            select(id, week, workplaces = predicted)) %>%
  left_join(by = "id",
            y = read_rds("raw_data/interpolate/la_covariates.rds") %>%
                        as.data.frame() %>%
                        select(-geometry) %>%
                        pivot_wider(id_cols = id, 
                                    names_from = variable, 
                                    values_from = predicted)) %>%
  as.data.frame() %>%
  # Average across grid cells
  group_by(geoid, week) %>%
  summarize_at(vars(workplaces, 
                    health_care_capacity, health_quality, 
                     employee_muni, democrat_2016),
               funs(mean(., na.rm = TRUE))) %>%
  ungroup() %>%
  rename(workplaces_int = workplaces,
         health_care_capacity_int = health_care_capacity,
         health_quality_int = health_quality,
         employee_muni_int = employee_muni,
         democrat_2016_int = democrat_2016)


# Update louisiana dataset with interpolated covariates
read_rds("raw_data/la_dataset.rds") %>%
  left_join(by = c("csub" = "geoid", "week"), y = dat) %>%
  saveRDS("raw_data/la_dataset.rds")




read_rds("raw_data/la_dataset.rds") %>%
  # Also, create a lagged case rate
  group_by(csub) %>%
  mutate(positivity_rate_lag2 = lead(positivity_rate, 2)) %>%
  ungroup() %>%
  filter(week >= "2020-08-19") %>%
  # Join in weather data  
  left_join(by = c("csub" = "geoid",  "week"),
          y = read_csv("raw_data/temperature/weather_weekly.csv")) %>%
  saveRDS("raw_data/la_dataset.rds")



# I want to run sensitivity tests,
# normally we test, what's the effect of a disaster two weeks earlier on the outcome 2 weeks later.
# I have many variables that were time lagged.
# I'd rather just bump the positivity rate, not those

# Situation 1:
# On August 26, postivity rate of 4.8%, avg. rainfall on August 11 of 60.4
# On Sept 2, positivity rate of 11.2%, avg. rainfall on August 26 of 300.1

# What I really want to do is say,
# Hey, let's imagine, what would be the positivity rate (14.6) one week later on August 19 IF average rainfall on August 11 were 60.4?

# To do this, we need to use 'lead' 1, which will bring last week's entry up by one week.

# To do this, we need to use 'lag' 1, which will bring the  positivity rate entry three weeks after the avg_rainfall.

library(tidyverse)
read_rds("raw_data/la_dataset.rds") %>% 
  # Also, create a lagged case rate
  group_by(csub) %>%
  mutate(rate1 = dplyr::lead(positivity_rate, 1),
         rate3 = lag(positivity_rate, 1),
         rate4 = lag(positivity_rate, 2),
         rate5 = lag(positivity_rate, 3)) %>%
  ungroup() %>%
  saveRDS("raw_data/la_dataset.rds")

rm(list = ls())


```







